In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re
import plotly.express as px
import matplotlib.pyplot as plt
import os
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
In [2]:
# Load the character table. The path is relative, so it is resolved against
# the notebook's current working directory.
marvel = pd.read_csv(filepath_or_buffer="Marvel_Movies.csv")
marvel
Out[2]:
name ID ALIGN EYE HAIR SEX GSM ALIVE APPEARANCES FIRST APPEARANCE Year
0 Spider-Man (Peter Parker) Secret Identity Good Characters Hazel Eyes Brown Hair Male Characters NaN Living Characters 4043.0 Aug-62 1962.0
1 Captain America (Steven Rogers) Public Identity Good Characters Blue Eyes White Hair Male Characters NaN Living Characters 3360.0 Mar-41 1941.0
2 Wolverine (James \"Logan\" Howlett) Public Identity Neutral Characters Blue Eyes Black Hair Male Characters NaN Living Characters 3061.0 Oct-74 1974.0
3 Iron Man (Anthony \"Tony\" Stark) Public Identity Good Characters Blue Eyes Black Hair Male Characters NaN Living Characters 2961.0 Mar-63 1963.0
4 Thor (Thor Odinson) No Dual Identity Good Characters Blue Eyes Blond Hair Male Characters NaN Living Characters 2258.0 Nov-50 1950.0
... ... ... ... ... ... ... ... ... ... ... ...
16371 Ru'ach (Earth-616) No Dual Identity Bad Characters Green Eyes No Hair Male Characters NaN Living Characters NaN NaN NaN
16372 Thane (Thanos' son) (Earth-616) No Dual Identity Good Characters Blue Eyes Bald Male Characters NaN Living Characters NaN NaN NaN
16373 Tinkerer (Skrull) (Earth-616) Secret Identity Bad Characters Black Eyes Bald Male Characters NaN Living Characters NaN NaN NaN
16374 TK421 (Spiderling) (Earth-616) Secret Identity Neutral Characters NaN NaN Male Characters NaN Living Characters NaN NaN NaN
16375 Yologarch (Earth-616) NaN Bad Characters NaN NaN NaN NaN Living Characters NaN NaN NaN

16376 rows × 11 columns

In [3]:
# (number of rows, number of columns) of the loaded table.
marvel.shape
Out[3]:
(16376, 11)
In [4]:
# Per-column dtype and non-null count; shows which columns contain missing values.
marvel.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16376 entries, 0 to 16375
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name              16376 non-null  object 
 1   ID                12606 non-null  object 
 2   ALIGN             13564 non-null  object 
 3   EYE               6609 non-null   object 
 4   HAIR              12112 non-null  object 
 5   SEX               15522 non-null  object 
 6   GSM               90 non-null     object 
 7   ALIVE             16373 non-null  object 
 8   APPEARANCES       15280 non-null  float64
 9   FIRST APPEARANCE  15561 non-null  object 
 10  Year              15561 non-null  float64
dtypes: float64(2), object(9)
memory usage: 1.4+ MB

Разделение реальных имён и ников/названий героев¶

In [5]:
# Raw labels look like "Alias (Real Name)", e.g. "Spider-Man (Peter Parker)".
# The FIRST parenthesised group is taken as the real name; whatever is left of
# the label becomes the alias.
# A raw string is required here: "\(" in a normal literal is an invalid escape
# sequence and triggers a warning on recent Python versions.
_PAREN_GROUP = re.compile(r"\(([^)]*)\)")


def _split_character_name(label):
    """Split one raw ``name`` label into ``(real_name, alias)``.

    Parameters
    ----------
    label : str or NaN
        Raw value of the ``name`` column.

    Returns
    -------
    tuple
        ``(real_name, alias)``. ``real_name`` is the text inside the first
        pair of parentheses, or NaN when the label has none. ``alias`` is the
        label with that single group removed and whitespace normalised; a
        label without parentheses is kept whole as the alias instead of being
        discarded. Non-string input yields ``(NaN, NaN)``.
    """
    if not isinstance(label, str):
        return np.nan, np.nan
    match = _PAREN_GROUP.search(label)
    if match is None:
        return np.nan, " ".join(label.split())
    real_name = match.group(1)
    # Cut out only the matched span (str.replace would also strip any repeat of
    # the same text) and collapse the whitespace the removal leaves behind.
    remainder = label[:match.start()] + label[match.end():]
    return real_name, " ".join(remainder.split())


# NOTE(review): for labels such as "Ru'ach (Earth-616)" the only parenthesised
# group is a universe tag, so it ends up in `name` — confirm this is acceptable.

# The split overwrites `name`, so it must run only once: a second pass would
# find no parentheses left and wipe the real names.
if 'nicknames' not in marvel.columns:
    split_names = [_split_character_name(label) for label in marvel['name']]
    marvel['nicknames'] = [alias for _, alias in split_names]
    marvel['name'] = [real_name for real_name, _ in split_names]
marvel
Out[5]:
name ID ALIGN EYE HAIR SEX GSM ALIVE APPEARANCES FIRST APPEARANCE Year nicknames
0 Peter Parker Secret Identity Good Characters Hazel Eyes Brown Hair Male Characters NaN Living Characters 4043.0 Aug-62 1962.0 Spider-Man
1 Steven Rogers Public Identity Good Characters Blue Eyes White Hair Male Characters NaN Living Characters 3360.0 Mar-41 1941.0 Captain America
2 James \"Logan\" Howlett Public Identity Neutral Characters Blue Eyes Black Hair Male Characters NaN Living Characters 3061.0 Oct-74 1974.0 Wolverine
3 Anthony \"Tony\" Stark Public Identity Good Characters Blue Eyes Black Hair Male Characters NaN Living Characters 2961.0 Mar-63 1963.0 Iron Man
4 Thor Odinson No Dual Identity Good Characters Blue Eyes Blond Hair Male Characters NaN Living Characters 2258.0 Nov-50 1950.0 Thor
... ... ... ... ... ... ... ... ... ... ... ... ...
16371 Earth-616 No Dual Identity Bad Characters Green Eyes No Hair Male Characters NaN Living Characters NaN NaN NaN Ru'ach
16372 Thanos' son No Dual Identity Good Characters Blue Eyes Bald Male Characters NaN Living Characters NaN NaN NaN Thane (Earth-616)
16373 Skrull Secret Identity Bad Characters Black Eyes Bald Male Characters NaN Living Characters NaN NaN NaN Tinkerer (Earth-616)
16374 Spiderling Secret Identity Neutral Characters NaN NaN Male Characters NaN Living Characters NaN NaN NaN TK421 (Earth-616)
16375 Earth-616 NaN Bad Characters NaN NaN NaN NaN Living Characters NaN NaN NaN Yologarch

16376 rows × 12 columns

Статистика персонажей по полу¶

In [6]:
# Missing SEX values become an explicit category so they show up in the counts.
marvel['SEX'] = marvel['SEX'].fillna("Не определён")
gendre = marvel['SEX'].value_counts()

# Plot from an explicit long-form frame: with a bare Series plotly uses the
# wide-form keys (index name / "value" / "variable"), so a labels mapping on
# "variable" renames the legend rather than the y axis.
gender_counts = gendre.rename_axis('Genders').reset_index(name='Quantity')
px.bar(gender_counts, x='Genders', y='Quantity', text_auto='.4s')
In [7]:
# Distinct SEX categories, in order of first appearance in the table.
marvel['SEX'].unique()
Out[7]:
array(['Male Characters', 'Female Characters', 'Genderfluid Characters',
       'Agender Characters', 'Не определён'], dtype=object)
In [8]:
genderValues = marvel['SEX'].unique()
# Not used in this cell; kept so any cell that reads `total` keeps working.
total = marvel['SEX'].value_counts().sum()
print("Процентаж персонажей по полу")

# Share of each category among ALL rows, computed in one vectorised step
# instead of growing a Series with concat inside a loop. Reindexing keeps the
# order of first appearance used for the printed report.
# Assumes SEX has no missing values left (they are filled in an earlier cell).
gendrePercentage = (marvel['SEX'].value_counts() / len(marvel) * 100).reindex(genderValues)
for v, p in gendrePercentage.items():
    print(v + ": " + str(round(float(p), 2)) + " %")

# Explicit long-form frame so the axis titles are the column names rather than
# plotly's wide-form defaults.
gender_share = (
    gendrePercentage
    .sort_values(ascending=False)
    .rename_axis('Genders')
    .reset_index(name='Percentage')
)
px.bar(gender_share, x='Genders', y='Percentage', text_auto='.4s')
Процентаж персонажей по полу
Male Characters: 71.07 %
Female Characters: 23.43 %
Genderfluid Characters: 0.01 %
Agender Characters: 0.27 %
Не определён: 5.21 %

Соотношение живых и погибших персонажей, а также персонажей с неизвестным статусом¶

In [9]:
# Distinct ALIVE categories, including NaN when the status is missing.
marvel['ALIVE'].unique()
Out[9]:
array(['Living Characters', 'Deceased Characters', nan], dtype=object)
In [10]:
# Fraction (0..1, not percent) of all rows falling into each life status.
n_rows = len(marvel)
status_masks = [
    ("ALIVE", marvel['ALIVE'] == "Living Characters"),
    ("Deceased Characters", marvel['ALIVE'] == "Deceased Characters"),
    ("Unknown status", marvel['ALIVE'].isnull()),
]
for label, mask in status_masks:
    # Plain int / int division so the printed float matches a row-count ratio.
    print(label + " - " + str(int(mask.sum()) / n_rows))
ALIVE - 0.7699071812408402
Deceased Characters - 0.22990962383976551
Unknown status - 0.00018319491939423546

Количество новых персонажей по годам¶

In [11]:
# Number of rows per value of Year, ordered chronologically by the index.
years = marvel['Year'].value_counts().sort_index()
px.line(years)

ТОП-10 персонажей, которые упоминались чаще всего¶

In [12]:
# Pick the ten highest APPEARANCES values explicitly instead of relying on the
# CSV happening to be sorted; rows with a missing count are never selected.
top10 = marvel.nlargest(10, 'APPEARANCES')
fig = px.bar(data_frame=top10, x='nicknames', y='APPEARANCES', title='ТОП - 10 персонажей по популярности',
             color='APPEARANCES',
             color_continuous_scale=["orange", "red",
                                     "green", "blue",
                                     "purple"])
fig.show()

Статистика по цвету глаз¶

In [13]:
# Tally each eye colour, order ascending and keep the first five rows.
# NOTE(review): ascending sort + head() selects the five RAREST eye colours;
# if the most common ones were intended, take the head before sorting — confirm.
eye_counts = marvel['EYE'].value_counts()
eye = eye_counts.sort_values().head()
px.bar(eye, orientation='h')

Статистика по цвету волос¶

In [14]:
# Tally each hair colour and show all categories, most frequent first.
hair_counts = marvel['HAIR'].value_counts()
hair = hair_counts.sort_values(ascending=False)
px.bar(hair)

Статистика по внешности (сгруппированная)¶

In [15]:
# Ten most frequent (EYE, HAIR) pairs; count() tallies non-null `name` values
# per pair, and rows missing either EYE or HAIR are not grouped.
marvel.groupby(['EYE', 'HAIR'])['name'].count().nlargest(10)
Out[15]:
EYE         HAIR      
Brown Eyes  Black Hair    824
            Brown Hair    655
Blue Eyes   Blond Hair    648
            Black Hair    391
            Brown Hair    294
Black Eyes  Black Hair    257
Red Eyes    No Hair       186
Green Eyes  No Hair       153
Brown Eyes  Bald          137
Blue Eyes   White Hair    132
Name: name, dtype: int64
In [16]:
# Same top-10 (EYE, HAIR) tally as the previous cell, pivoted so each eye
# colour becomes a column and is drawn in its own bar subplot over hair colours.
dfg = marvel.groupby(['EYE', 'HAIR'])['name'].count().nlargest(10)
hair_by_eye = dfg.unstack(level=0)
ax = hair_by_eye.plot.bar(subplots=True, rot=40, figsize=(20, 20), layout=(6, 5))
plt.show()